import numpy as np
import matplotlib.pyplot as plt
from matplotlib import colors
from mpl_toolkits.axes_grid1 import make_axes_locatable, axes_size
import os, sys
from scipy.signal import find_peaks
from scipy.ndimage import gaussian_filter as gf
from tools import *
counts={'EEP':np.genfromtxt('txt_HiCool/EEP_merged.txt'),\
'HilD':np.genfromtxt('txt_HiCool/DhilD_ESP_merged.txt'),\
'MKL82_OFF':np.genfromtxt('txt_HiCool/MKL82_Merged_bis.txt'),\
'MKL81_ON':np.genfromtxt('txt_HiCool/MKL81_Merged_bis.txt'),\
'MKL41':np.genfromtxt('txt_HiCool/MKL41_Merged.txt'),\
'MKL42':np.genfromtxt('txt_HiCool/MKL42_Merged.txt')}
cumul_counts = {batch:np.sum(counts[batch],axis=0) for batch in counts}
#print(cumul_counts)
nb_interacting_bins = {batch:np.sum(counts[batch]>0,axis=0) for batch in counts}
results_fo = './myfigures_FI_final_Gaussian2/'
if not os.path.exists(results_fo): os.mkdir(results_fo)
fooo=results_fo+'interacting_bins/'
if not os.path.exists(fooo): os.mkdir(fooo)
min_intearcting_bins = {}
for batch in counts:
plt.figure()
plt.hist(nb_interacting_bins[batch],bins=np.arange(np.max(nb_interacting_bins[batch])+1));
index_maxa = np.argwhere(nb_interacting_bins[batch] == np.max(nb_interacting_bins[batch]))[0]
hist, bin_edges= np.histogram(nb_interacting_bins[batch],bins=np.arange(np.max(nb_interacting_bins[batch])+1))
idx_max = np.argwhere(hist == np.max(hist)).flatten()[0]
#min_intearcting_bins[batch] = bin_edges[np.argwhere(hist[:idx_max]<=1)[-1]+1]
min_intearcting_bins[batch] = int(0.8 * bin_edges[idx_max])
plt.axvline(min_intearcting_bins[batch],color='red')
plt.xlabel('number of interacting bins')
plt.ylabel('number of bins')
plt.title(batch)
plt.tight_layout()
plt.savefig(results_fo+'interacting_bins/nb_interacting_bins_%s.pdf'%batch)
frac_mapped = 0.8
mapped_idx = {batch:np.argwhere(nb_interacting_bins[batch]>=min_intearcting_bins[batch]).flatten() for batch in nb_interacting_bins}
#mininum positive values to compute pseudo_counts
min_positive_counts={t:np.min(counts[t][counts[t]>0]) for t in counts}
max_counts={t:np.max(counts[t]) for t in counts}
nbins = {batch:counts[batch].shape[0] for batch in counts}
print(nbins)
{'EEP': 4879, 'HilD': 5069, 'MKL82_OFF': 5069, 'MKL81_ON': 5069, 'MKL41': 5069, 'MKL42': 5069}
new2ref_idx = {}
for batch in mapped_idx:
new2ref_idx[batch] = {}
i = -1
for i0 in range(nbins[batch]):
if i0 in mapped_idx[batch]:
i+=1
new2ref_idx[batch][i]=i0
print({batch:len(mapped_idx[batch])/counts[batch].shape[0] for batch in counts})
{'EEP': 1.0, 'HilD': 1.0, 'MKL82_OFF': 1.0, 'MKL81_ON': 1.0, 'MKL41': 1.0, 'MKL42': 1.0}
fooo=results_fo+'only_mapped_bins/matrices/'
if not os.path.exists(fooo): os.makedirs(fooo,exist_ok=True)
for batch in counts:
np.savetxt(results_fo+'only_mapped_bins/matrices/'+batch+'_matrix.txt', counts[batch][np.ix_(mapped_idx[batch],mapped_idx[batch])])
with open(results_fo+'only_mapped_bins/matrices/'+batch+'_bins_correspondence.txt','w') as out:
out.write('new_bin\tref_bin\n')
for inew in sorted(new2ref_idx[batch].keys()):
out.write('%d\t%d\n'%(inew+1, new2ref_idx[batch][inew]+1))
corrected_counts = {batch: counts[batch][np.ix_(mapped_idx[batch],mapped_idx[batch])] for batch in counts}
fooo=results_fo+'only_mapped_bins/plots/'
if not os.path.exists(fooo): os.mkdir(fooo)
plt.figure(figsize=(8,8))
for batch in counts:
plt.imshow(corrected_counts[batch], norm=colors.SymLogNorm(0.001, base=np.e), cmap='bone_r')
plt.title(batch, fontsize = 25)
ticks = np.arange(0, len(corrected_counts[batch]), 50)
plt.xticks(ticks, ['%d/%d'%(t,new2ref_idx[batch][t]) for t in ticks], rotation = -90)
plt.yticks(ticks, ['%d/%d'%(t,new2ref_idx[batch][t]) for t in ticks])
plt.tight_layout()
# plt.savefig(results_fo+'only_mapped_bins/plots/'+batch+'.pdf', transparent=True)
gaussian_width=2 #to mitigate noise in generate_frontiers
domain_fo=results_fo+'domains_gw=%.1f/'%(gaussian_width)
if not os.path.exists(domain_fo): os.mkdir(domain_fo)
#mininum positive values to compute pseudo_counts
min_positive_counts={t:np.min(corrected_counts[t][corrected_counts[t]>0]) for t in corrected_counts}
max_counts={t:np.max(corrected_counts[t]) for t in corrected_counts}
print(max_counts)
{'EEP': 0.15619399, 'HilD': 0.073081697, 'MKL82_OFF': 0.069879013, 'MKL81_ON': 0.069170502, 'MKL41': 0.052661108, 'MKL42': 0.067896998}
#pseudo_counts={t:min_positive_counts[t]/2 for t in max_counts}
pseudo_counts={t:max_counts[t]/20 for t in max_counts}
frontiers_matrix = {t:generate_frontiers(np.log(corrected_counts[t]+pseudo_counts[t]),w=gaussian_width) for t in counts}
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['figure.dpi'] = 150
#plt.rcParams['figure.figsize'] = 5, 4
plt.rcParams['font.size'] = 8
fig, axes = plt.subplots(figsize=(6, 12), ncols=2, nrows=6, tight_layout=True)
for i,t in enumerate(['EEP', 'HilD', 'MKL82_OFF','MKL81_ON', 'MKL41', 'MKL42']):
axes[i][0].imshow(
corrected_counts[t],
norm=colors.LogNorm(vmin=min_positive_counts[t], vmax=1.5*max_counts[t]),
cmap='BuPu')
axes[i][0].set_title('HiC heatmap - %s'%t)
axes[i][1].imshow(
frontiers_matrix[t][0],
cmap="Greys",vmax=0.025);#,
# vmin=0,
# vmax=0.001);
axes[i][1].set_title('Frontier signal - %s'%t);
fig.savefig(domain_fo+'frontier_maps.pdf', transparent=True)
plt.rcParams['legend.numpoints'] = 1
plt.rcParams['figure.dpi'] = 150
# plt.rcParams['figure.figsize'] = 5, 4 # This line is still commented out
plt.rcParams['font.size'] = 8
fig, axes = plt.subplots(figsize=(6, 12), ncols=2, nrows=6, tight_layout=True)
for i, t in enumerate(['EEP', 'HilD', 'MKL82_OFF', 'MKL81_ON', 'MKL42', 'MKL41']):
# Heatmap plot
axes[i][0].imshow(
corrected_counts[t],
norm=colors.LogNorm(vmin=min_positive_counts[t], vmax=1.5*max_counts[t]),
cmap='BuPu'
)
axes[i][0].set_title(f'HiC heat map - {t}')
axes[i][0].set_xlim(3000, 3100)
axes[i][0].set_ylim(3000, 3100)
axes[i][0].invert_yaxis() # Invert the y-axis
# Frontier signal plot
axes[i][1].imshow(
frontiers_matrix[t][0],
cmap="Greys", vmax=0.05
)
axes[i][1].set_title(f'Frontier signal - {t}')
axes[i][1].set_xlim(3000, 3100)
axes[i][1].set_ylim(3000, 3100)
axes[i][1].invert_yaxis() # Invert the y-axis
fig.savefig(domain_fo + 'frontier_maps_SPI-1.pdf', transparent=True)
max_s=25 #to mitigate noise in compute_frontier_indexes (used 80 for the article)
fi_fo=results_fo+'domains_gw=%.1f/fi_max_s=%d/'%(gaussian_width,max_s)
if not os.path.exists(fi_fo): os.mkdir(fi_fo)
frontier_index = {t:compute_frontier_indexes(frontiers_matrix[t],max_s=max_s) for t in frontiers_matrix}
batch_axes=['EEP', 'HilD', 'MKL82_OFF','MKL81_ON', 'MKL41', 'MKL42']
fig, axes = plt.subplots(figsize=(8, 9), nrows=6, tight_layout=True)
for iplot in [0,1,2,3, 4,5]:
loci=[i for i in range(len(frontier_index[batch_axes[iplot]][0]))]
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
axes[iplot].plot(loci,[frontier_index[batch_axes[iplot]][i][l] for l in loci], '%s+-'%c,label=lab);
axes[iplot].set_title(batch_axes[iplot])
axes[iplot].set_ylim([0,7])
#for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
# axes[1].plot(loci,[frontier_index[batch][i][l] for l in loci], '%s+-'%c,label=lab);
#axes[1].set_xlim([0,200])
#axes[0].set_title(batch)
axes[1].set_xlabel('locus=(HiC) matrix index');
for i in [0, 1,2]:
axes[i].set_ylabel('Frontier index');
axes[i].legend(fontsize="medium");
#fig.savefig(fi_fo+'profiles_CS.pdf', transparent=True)
#identification of all peaks with their value
peaks = {t:[find_peaks(frontier_index[t][i]) for i in range(2)] for t in frontier_index}
#extract peaks values to perform stat analysis
peak_values={t:{i:np.array([peaks[t][i][1][ip] for ip, p in enumerate(peaks[t][i][0])]) for i in [0,1]}
for t in peaks}
mixed_peak_values={t:np.array([peaks[t][0][1][ip] for ip, p in enumerate(peaks[t][0][0])]+
[peaks[t][1][1][ip] for ip, p in enumerate(peaks[t][1][0])])
for t in peaks}
#median (med) and median absolute deviaiton (mad)
#rem: compare to std, mad is a more robust estimate of the "variance of the null model"
med_peaks={t:np.median(mixed_peak_values[t]) for t in mixed_peak_values}
mad_peaks={t:np.median(np.abs(mixed_peak_values[t]-med_peaks[t])) for t in mixed_peak_values}
rho=1.48
k_sigma=2
thresh={t:med_peaks[t]+k_sigma*rho*mad_peaks[t] for t in med_peaks}
print(thresh.keys())
dict_keys(['EEP', 'HilD', 'MKL82_OFF', 'MKL81_ON', 'MKL41', 'MKL42'])
plt.figure(figsize=(6,8),tight_layout=True)
bins=np.arange(0,6,0.03)
for it,batch in enumerate(thresh):
plt.subplot(6,2,it+1)
plt.hist(mixed_peak_values[batch],bins=bins,label=batch);
plt.axvline(med_peaks[batch],color='k',linestyle='dashed')
plt.axvline(thresh[batch],color='r',linestyle='dashed')
plt.legend(loc=1)
signif_fo=fi_fo+'profiles_signif_peaks/'
if not os.path.exists(signif_fo): os.mkdir(signif_fo)
signif_fo_filtered=signif_fo+'filtered_data/'
if not os.path.exists(signif_fo_filtered): os.mkdir(signif_fo_filtered)
batch_axes=list(thresh.keys())
for iplot in range(len(batch_axes)):
fig, axes = plt.subplots(figsize=(8, 3), nrows=2, tight_layout=True)
loci=[i for i in range(len(frontier_index[batch_axes[iplot]][0]))]
mask=[np.zeros(len(loci),dtype=bool),np.zeros(len(loci),dtype=bool)]
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
for l in [l for l in loci if frontier_index[batch_axes[iplot]][i][l] > thresh[batch_axes[iplot]]]:
mask[i][l]=True
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
axes[1].plot(loci,(2*i-1)*np.array([frontier_index[batch_axes[iplot]][i][l]
for l in loci])*mask[i],
'%s-'%c,label=lab,lw=4);
axes[0].plot(loci,(2*i-1)*np.array([frontier_index[batch_axes[iplot]][i][l]
for l in loci])*mask[i]*
(mask[np.abs(i-1)]+np.roll(mask[np.abs(i-1)],1)+np.roll(mask[np.abs(i-1)],-1)),
'%s-'%c,label=lab,lw=4);
for i in [0,1]:
#axes[i].set_title(batch_axes[iplot])
axes[i].set_ylim([-5,5])
axes[i].set_xlim([0,len(loci)])
axes[i].set_xticks([])
axes[i].set_yticks([])
#for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
# axes[1].plot(loci,[frontier_index[batch][i][l] for l in loci], '%s+-'%c,label=lab);
#axes[1].set_xlim([0,200])
#axes[0].set_title(batch)
fig.savefig(signif_fo_filtered+'all_and_overlapping_peaks_no_labels_%s.pdf'%batch_axes[iplot], transparent=True)
frontier_index_original_HiC={t:[np.zeros(len(counts[t])) for _ in [0,1]] for t in frontier_index}
for t in frontier_index_original_HiC:
for ix_f in [0,1]:
for l in range(len(frontier_index[t][0])):
new_l=new2ref_idx[t][l]
frontier_index_original_HiC[t][ix_f][new_l]=frontier_index[t][ix_f][l]
signif_fo_original=signif_fo+'original_data/'
if not os.path.exists(signif_fo_original): os.mkdir(signif_fo_original)
batch_axes=list(thresh.keys())
for iplot in range(len(batch_axes)):
fig, axes = plt.subplots(figsize=(8, 3), nrows=2, tight_layout=True)
loci=[i for i in range(len(frontier_index_original_HiC[batch_axes[iplot]][0]))]
mask=[np.zeros(len(loci),dtype=bool),np.zeros(len(loci),dtype=bool)]
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
for l in [l for l in loci if frontier_index_original_HiC[batch_axes[iplot]][i][l] > thresh[batch_axes[iplot]]]:
mask[i][l]=True
for i, (c, lab) in enumerate(zip(['C1','C2'], ['upstream', 'downstream'])):
axes[1].plot(loci,(2*i-1)*np.array([frontier_index_original_HiC[batch_axes[iplot]][i][l]
for l in loci])*mask[i],
'%s-'%c,label=lab,lw=4);
axes[0].plot(loci,(2*i-1)*np.array([frontier_index_original_HiC[batch_axes[iplot]][i][l]
for l in loci])*mask[i]*
(mask[np.abs(i-1)]+np.roll(mask[np.abs(i-1)],1)+np.roll(mask[np.abs(i-1)],-1)),
'%s-'%c,label=lab,lw=4);
for i in [0,1]:
#axes[i].set_title(batch_axes[iplot])
axes[i].set_ylim([-5,5])
axes[i].set_xlim([0,len(loci)])
axes[i].set_xticks([])
axes[i].set_yticks([])
fig.savefig(signif_fo_original+'all_and_overlapping_peaks_no_labels_%s.pdf'%batch_axes[iplot], transparent=True)
signif_fo_original_data=signif_fo_original+'peaks_location/'
if not os.path.exists(signif_fo_original_data): os.mkdir(signif_fo_original_data)
batch_axes=list(thresh.keys())
for iplot in range(len(batch_axes)):
loci=[i for i in range(len(frontier_index_original_HiC[batch_axes[iplot]][0]))]
mask=[np.zeros(len(loci),dtype=bool),np.zeros(len(loci),dtype=bool)]
for i, lab in enumerate(['upstream', 'downstream']):
#for l in [l for l in loci if frontier_index_original_HiC[batch_axes[iplot]][i][l] > thresh[batch_axes[iplot]] ]:
#for l in [l for l in loci]:
for l in [new2ref_idx[batch_axes[iplot]][p] for p in peaks[batch_axes[iplot]][i][0] if frontier_index[batch_axes[iplot]][i][p] > thresh[batch_axes[iplot]]]:
#for l in [new2ref_idx[batch_axes[iplot]][p] for p in peaks[batch_axes[iplot]][i][0]]:
mask[i][l]=True
with open(signif_fo_original_data+'%s_peaks_%s.txt'%(lab,batch_axes[iplot]),'w') as out:
out.write('#bin\tFI\n')
for loc in [loc for loc in loci if mask[i][loc]]:
out.write('%d\t%.2f\n'%(loc+1,frontier_index_original_HiC[batch_axes[iplot]][i][loc]))